{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b478298-3439-45b5-8242-e7151c86ee94",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "from requests.adapters import HTTPAdapter, Retry\n",
    "from openpyxl import Workbook\n",
    "import time\n",
    "import pyautogui\n",
    "import pandas as pd\n",
    "from urllib.parse import quote\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e402a408-0f8e-4fa0-844e-c70cf3035d3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ----------------------------\n",
    "# Helpers\n",
    "# ----------------------------\n",
    "def make_session():\n",
    "    s = requests.Session()\n",
    "    retries = Retry(\n",
    "        total=5,\n",
    "        backoff_factor=0.6,\n",
    "        status_forcelist=[429, 500, 502, 503, 504],\n",
    "        allowed_methods=[\"GET\"],\n",
    "        raise_on_status=False,\n",
    "    )\n",
    "    s.mount(\"https://\", HTTPAdapter(max_retries=retries))\n",
    "    s.mount(\"http://\", HTTPAdapter(max_retries=retries))\n",
    "    s.headers.update({\n",
    "        \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
    "                      \"AppleWebKit/537.36 (KHTML, like Gecko) \"\n",
    "                      \"Chrome/120.0.0.0 Safari/537.36\"\n",
    "    })\n",
    "    return s\n",
    "\n",
    "def clean_text(txt: str) -> str:\n",
    "    if not txt:\n",
    "        return \"\"\n",
    "    txt = re.sub(r\"\\s+\", \" \", txt).strip()\n",
    "    return txt\n",
    "\n",
    "def pick_article_url(info_links):\n",
    "    \"\"\"\n",
    "    NAVER search result has multiple 'a.info' links.\n",
    "    Prefer Naver News (n.news.naver.com) if present.\n",
    "    Otherwise fall back to second link if exists.\n",
    "    \"\"\"\n",
    "    hrefs = [a.get(\"href\", \"\") for a in info_links if a.get(\"href\")]\n",
    "    for h in hrefs:\n",
    "        if \"n.news.naver.com\" in h:\n",
    "            return h\n",
    "    if len(hrefs) >= 2:\n",
    "        return hrefs[1]\n",
    "    if len(hrefs) >= 1:\n",
    "        return hrefs[0]\n",
    "    return None\n",
    "\n",
    "def parse_article(session, url, company_text=\"\"):\n",
    "    \"\"\"\n",
    "    Parse NAVER news article pages and return (title, content, date_ym).\n",
    "    Keep it modest and robust across common layouts.\n",
    "    \"\"\"\n",
    "    r = session.get(url, timeout=10)\n",
    "    if r.status_code != 200:\n",
    "        return None, None, None\n",
    "\n",
    "    soup = BeautifulSoup(r.text, \"html.parser\")\n",
    "\n",
    "    # Title: try common selectors\n",
    "    title_el = (\n",
    "        soup.select_one(\"h2#title_area\") or\n",
    "        soup.select_one(\"h2.media_end_head_headline\") or\n",
    "        soup.select_one(\"h1\")  # fallback\n",
    "    )\n",
    "\n",
    "    # Content: try common selectors\n",
    "    content_el = (\n",
    "        soup.select_one(\"div#dic_area\") or\n",
    "        soup.select_one(\"div#newsct_article\") or\n",
    "        soup.select_one(\"div#articeBody\") or  # legacy typo in some old templates\n",
    "        soup.select_one(\"div#articleBodyContents\")  # older template\n",
    "    )\n",
    "\n",
    "    # Date: try common selectors\n",
    "    date_el = (\n",
    "        soup.select_one(\"span.media_end_head_info_datestamp_time\") or\n",
    "        soup.select_one(\"span._ARTICLE_DATE_TIME\") or\n",
    "        soup.select_one(\"span.t11\")  # older\n",
    "    )\n",
    "\n",
    "    title = clean_text(title_el.get_text(\" \", strip=True)) if title_el else \"\"\n",
    "    content = clean_text(content_el.get_text(\" \", strip=True)) if content_el else \"\"\n",
    "    date_txt = clean_text(date_el.get_text(\" \", strip=True)) if date_el else \"\"\n",
    "\n",
    "    # extract YYYY-MM (or YYYY.MM) robustly\n",
    "    ym = \"\"\n",
    "    m = re.search(r\"(20\\d{2})[.\\-/년 ]\\s*(\\d{1,2})\", date_txt)\n",
    "    if m:\n",
    "        ym = f\"{m.group(1)}-{int(m.group(2)):02d}\"\n",
    "\n",
    "    return title, content, ym\n",
    "\n",
    "# ----------------------------\n",
    "# Inputs\n",
    "# ----------------------------\n",
    "keyword = pyautogui.prompt(\"insert keyword\")\n",
    "lastpage = int(pyautogui.prompt(\"How many pages?\"))\n",
    "\n",
    "# ----------------------------\n",
    "# Workbook\n",
    "# ----------------------------\n",
    "wb = Workbook()\n",
    "ws = wb.active\n",
    "ws.append([\"link\", \"title\", \"contents\", \"date_ym\", \"company\"])\n",
    "\n",
    "ws.column_dimensions[\"A\"].width = 60\n",
    "ws.column_dimensions[\"B\"].width = 60\n",
    "ws.column_dimensions[\"C\"].width = 120\n",
    "ws.column_dimensions[\"D\"].width = 20\n",
    "ws.column_dimensions[\"E\"].width = 30\n",
    "\n",
    "# ----------------------------\n",
    "# Scrape loop\n",
    "# ----------------------------\n",
    "session = make_session()\n",
    "seen_urls = set()\n",
    "\n",
    "row = 2\n",
    "page_num = 1\n",
    "\n",
    "# NOTE: start parameter increments by 10 (1,11,21,...) in NAVER search\n",
    "for start in range(1, lastpage * 10, 10):\n",
    "    print(f\"[Page {page_num}] scraping start={start} =========================================\")\n",
    "\n",
    "    q = quote(keyword)\n",
    "    search_url = (\n",
    "        \"https://search.naver.com/search.naver\"\n",
    "        f\"?where=news&sm=tab_pge&query={q}\"\n",
    "        \"&sort=0&photo=0&field=0&pd=3\"\n",
    "        \"&ds=2012.01.01&de=2012.01.30\"\n",
    "        \"&nso=so:r,p:from20120101to20120130,a:all\"\n",
    "        f\"&start={start}\"\n",
    "    )\n",
    "\n",
    "    resp = session.get(search_url, timeout=10)\n",
    "    if resp.status_code != 200:\n",
    "        print(f\"  - search page failed: {resp.status_code}\")\n",
    "        page_num += 1\n",
    "        continue\n",
    "\n",
    "    soup = BeautifulSoup(resp.text, \"html.parser\")\n",
    "    articles = soup.select(\"div.news_area\") or soup.select(\"div.info_group\")\n",
    "\n",
    "    for article in articles:\n",
    "        # press / company\n",
    "        company_el = article.select_one(\"a.info.press\")\n",
    "        company = clean_text(company_el.get_text(\" \", strip=True)) if company_el else \"\"\n",
    "\n",
    "        info_links = article.select(\"a.info\")\n",
    "        url = pick_article_url(info_links)\n",
    "        if not url:\n",
    "            continue\n",
    "\n",
    "        # --- URL de-dup ---\n",
    "        if url in seen_urls:\n",
    "            continue\n",
    "        seen_urls.add(url)\n",
    "\n",
    "        title, content, ym = parse_article(session, url, company_text=company)\n",
    "\n",
    "        # Skip if parsing failed or empty content\n",
    "        if not title or not content:\n",
    "            continue\n",
    "\n",
    "        ws[f\"A{row}\"] = url\n",
    "        ws[f\"B{row}\"] = title\n",
    "        ws[f\"C{row}\"] = content\n",
    "        ws[f\"D{row}\"] = ym\n",
    "        ws[f\"E{row}\"] = company\n",
    "\n",
    "        row += 1\n",
    "        time.sleep(0.4)  # be polite\n",
    "\n",
    "    page_num += 1\n",
    "    time.sleep(0.7)\n",
    "\n",
    "# ----------------------------\n",
    "# Optional: save\n",
    "# ----------------------------\n",
    "out_path = \"naver_news_scrape.xlsx\"\n",
    "wb.save(out_path)\n",
    "print(f\"Saved: {out_path} (rows={row-2}, unique_urls={len(seen_urls)})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15d47989-6f84-4008-ac51-770da9f218d1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9c2e508-14fa-4bd5-a595-2c8586e94f3c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
